
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below)
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime

### Script purpose: code promotions by grade, including overall and within-organization (allows to calculate those
### outside of an organization as a 'residue'), to director and rank promoted from

### Execution time: ~8 minutes

### Inputs: 
# 1) /data/intermediate/country_07_limpiar_cubertura.[format1]


### Outputs:
# 1) /data/intermediate/country_08_limpiar_ascensos.[format1]


#
# SET-UP --------------------------------------------------------------------------------------------
#

### Source the '00_global.R' script with required packages and functions
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make a copy of the file
file.copy(rstudioapi::getSourceEditorContext()$path,
          gsub('code', 'code/00_ARCHIVE', gsub('\\.R', ' - copy.R', rstudioapi::getSourceEditorContext()$path)),
          overwrite = T, copy.date = T)


### READ DATA -------------------------------------------------------------------------------------------------------------------


t0 = Sys.time() # record start time

# get columns used in this script
col_names = names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura.parquet")))

col_select1  = col_names[sapply(col_names, function(c) any(grepl(c, tolower(readLines(rstudioapi::getSourceEditorContext()$path)))))]
col_select1

country_ascensos = read_flex(file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura"), format = format1, col_select = col_select1)
country_ascensos = unique(country_ascensos)

## set as country_ascensos if not already done
if(!any(grepl('data.table', class(country_ascensos)))){setcountry_ascensos(country_ascensos)}
gc()


row1 = nrow(country_ascensos) %>% print
uniqueN(country_ascensos$row_id_org)



# ' ---------------------------------------------------------------------------------------
# PROMOTIONS ------------------------------------------------------------------------------
# 

# CHRISTIAN: administrativo - apoyo/assistente -> apoyo administrativo

funique(country_ascensos$tipo_estamento_comprimido)

# country_ascensos[, tipo_estamento_comprimido := fifelse(test = str_detect('auxiliar', tipo_estamento_comprimido),
#                                                       yes  = 'auxiliar o asistente', no = tipo_estamento_comprimido)]
rank_order <- c(
  "auxiliar o asistente" = 1,
  "administrativo" = 2,
  "tecnico" = 2.5,
  # "docente" = 3,
  # "medicos / personal de salud" = 3,
  "profesional" = 3,
  "directivo" = 4,
  # "alcalde" = NA,
  'estamento no definido' = NA # it can be anything - the mean(median) pay for that group is 1,080(710), so lower than any other, but is also has by far the highest SD (88,003), compared to 1,000-3,000 for all others (apart from directivos which is 14,600)
)


rank_df <- data.frame(
  tipo_estamento_comprimido = names(rank_order),
  rank_num = unname(rank_order),
  stringsAsFactors = FALSE
) %>% setDT()


### check -> pay distributions by rank
# tapply(country$pago_total/10^6, cifhile$tipo_estamento_comprimido, describe)
# tapply(country$pago_total/10^6, country$tipo_estamento_comprimido, function(x) round(quantile(x, probs = seq(0,1,.1)), 2))

# assign ranks as numbers
country_ascensos[, tipo_estamento_comprimido := sf_gsub(tipo_estamento_comprimido, "/", " o ", fixed=T)]
# sf(country_ascensos$tipo_estamento_comprimido)

country_ascensos = rank_df[country_ascensos, on = 'tipo_estamento_comprimido']





# define highest rank of each person in a month (controls for multi-month entries) both
# in general and within each organization a person might work in 
country_ascensos[, rank_num_max :=  max_miss(rank_num), by = .(person_id, anyo_mes)]

# only calculate if more than one organization per id-month, otherwise just assign rank_num_max
gc()
vars_group = c('anyo_mes', 'person_id', 'organismo_codigo')
temp = country_ascensos[, .N, by = vars_group][N > 1]
temp = temp[country_ascensos, on = vars_group, nomatch = 0] # works like dplyr inner_join
temp[, rank_num_max_org :=  max_miss(rank_num), by = .(person_id, anyo_mes, organismo_codigo)]
temp = temp[, .(row_id_org, rank_num_max_org)]

country_ascensos = temp[country_ascensos, on = 'row_id_org']


# if unique org, then assign same max rank
country_ascensos[, rank_num_max_org := fifelse(is.na(rank_num_max_org), rank_num_max, rank_num_max_org)]

table(country_ascensos$tipo_estamento_comprimido, country_ascensos$rank_num_max, useNA = 'ifany')
table(country_ascensos$rank_num_max, country_ascensos$rank_num_max_org, useNA = 'ifany')



### + promocion ------------------------------------------------------------------------------------------------
# order 
setorder(country_ascensos, person_id, anyo_mes)

# compute 1- and 2-month lag of rank_num
country_ascensos[, `:=`(
  rank_num_lag1 = shift(rank_num_max, 1, type = "lag"),
  rank_num_lag2 = shift(rank_num_max, 2, type = "lag")
), by = person_id]

### doing that with nafill() only really takes down NAs by a few percentage points
# country_ascensos[, `:=`(
#   rank_num_lag1 = nafill(shift(rank_num_max, type = "lag"), type = "locf"),
#   rank_num_lag2 = nafill(shift(rank_num_max, n = 2, type = "lag"), type = "locf")
# ), by = person_id]

### above is true because predominantly people either have defined or undefined rank in a given role, rarely
### there are both missing and non-missing rank observations per ID-cargo-org
temp = country_ascensos[,.(n=.N, na_count = sum(tipo_estamento_comprimido == 'estamento no definido')),
                      by = .(person_id, tipo_cargo_clean, organismo_nombre_clean)]

temp[, share := na_count/n]
temp$share %>% summary
temp$share %>% hist




### most recent change - current rank needs to be higher than in BOTH previous months -
### takes down annual % of promotions from ~2-2.5% to 1.5-1.9%
country_ascensos[, promocion := fifelse(
  !is.na(rank_num_max) & !is.na(rank_num_lag1) & rank_num_max > rank_num_lag1 & 
    !is.na(rank_num_max) & !is.na(rank_num_lag2) & !is.na(rank_num_lag2) & rank_num_max > rank_num_lag2,
  TRUE,FALSE
  )]



### (*)checks -> quick count of promotions in a year
temp = country_ascensos[, .(n=uniqueN(person_id)), by = .(promocion, anyo)] %>% 
  group_by(anyo) %>% mutate(share=n/sum(n)) %>% arrange(promocion, anyo)
temp


# to directivo?
country_ascensos[, promocion_directivo := fifelse(rank_num_max == 4 & promocion == T, T, F)]
  
pr_na(country_ascensos$promocion)
pr_na(country_ascensos$promocion_directivo)
table(country_ascensos$promocion, country_ascensos$promocion_directivo, useNA = 'ifany')


### + promocion_org ------------------------------------------------------------------------------------------------
country_ascensos[, `:=`(
  rank_num_org_lag1 = shift(rank_num_max_org, 1, type = "lag"),
  rank_num_org_lag2 = shift(rank_num_max_org, 2, type = "lag")
), by = .(person_id, organismo_codigo)]

country_ascensos[, promocion_org := fifelse(
  !is.na(rank_num_max_org) & !is.na(rank_num_org_lag1) & rank_num_max_org > rank_num_org_lag1 & 
    !is.na(rank_num_max_org) & !is.na(rank_num_org_lag2) & rank_num_max_org > rank_num_org_lag2,
  TRUE, FALSE
  )]


# to directivo?
country_ascensos[, promocion_directivo_org := fifelse(rank_num_max_org == 4 & promocion_org == T, T, F)]

pr(country_ascensos$promocion_org)
table(country_ascensos$promocion_directivo, country_ascensos$promocion_directivo_org, useNA = 'ifany')

dim(country_ascensos)


uniqueN(country_ascensos$row_id_org)





### <> tipo_estamento -> grado ----------------------------------------------------------------------------------------------------------------
country_ascensos = country_ascensos %>% rename(grado = tipo_estamento_comprimido)

### + grado_directivo ---------------------------------------------------------------------------------------------------------------------
country_ascensos[, grado_directivo := fifelse(grado == 'directivo', 1, 0)]


### + grado_dummy --------------------------------------------------------------------------------------------------------------------------
# create alternative columns with grado NAs if no grade is defined
country_ascensos[, grado_dummy := fifelse(grado == 'estamento no definido', NA, grado)]


### grade_anterior ---------------------------------------------------------------------------------------------------------------------------------
country_ascensos[, grado_anterior := fifelse(promocion, rank_num_lag1, NA)]

# turn back to name rather than number
country_ascensos[, grado_anterior := fcase(
  grado_anterior == 1, 'auxiliar o asistente',
  grado_anterior == 2, 'administrativo',
  grado_anterior == 2.5, 'tecnico',
  grado_anterior == 3, 'profesional',
  grado_anterior == 4, 'directivo')]




### <> by year ------------------------------------------------------------------------------------------------------------------------
country_ascensos[, `:=`(
    promocion_anyo       = fifelse(any(promocion == T), 'ascendido', 'no ascendido'),
    promocion_directivo_anyo   =  fifelse(any(promocion_directivo == T), 'ascendido', 'no ascendido'),
    
    promocion_anyo_org      = fifelse(any(promocion_org == T), 'ascendido', 'no ascendido'),
    promocion_directivo_anyo_org   =  fifelse(any(promocion_directivo_org == T), 'ascendido', 'no ascendido'),
    
    grado_directivo_anyo     = fifelse(any(grado_directivo == 1),  'directivo o gerente', 'no directivo o gerente')),
    
  by = .(anyo, person_id)]




### > save --------------------------------------------------------------------------------------------------------------------------------
country_save = country_ascensos %>% select(c(row_id_org, matches('^grado'), matches('_anyo')))

write_flex(x = country_save, file.path(main_dir, 'data', 'intermediate', "country_08_limpiar_ascensos"), format = format1)

beep('complete')
exec_time_fun('exec_time')




#
# FIN DEL CÓDIGO  ----------------------------------------------------------------------------------------------------------------------
# 